# -*- coding: utf-8 -*-
'''
Created on May 20, 2017

@author: ZhuJiahui
'''
import os
import time
import re
from file_utils.file_writer import quick_write_1d_to_text


def extract_news(read_filename, write_filename):
    '''Collect long quoted values from INSERT statements and save them.

    The input file is read line by line as UTF-8. Only lines containing
    the text "INSERT INTO" are inspected. On such a line, every span that
    starts right after a whitespace character plus a single quote and
    stops right before a single quote plus a comma is captured. When at
    least five spans are found and the fifth one, stripped of surrounding
    whitespace, is longer than 50 characters, that stripped text is kept.

    All kept texts are passed, in file order, to quick_write_1d_to_text
    together with write_filename.

    NOTE(review): presumably the fifth quoted value is the article body
    column of the dumped table -- confirm against the dump schema.

    :param read_filename: path of the SQL dump to scan.
    :param write_filename: destination handed to quick_write_1d_to_text.
    '''
    quoted_value = re.compile(r'(?<=\s\')([\s\S]*?)(?=\',)')
    collected = []

    with open(read_filename, 'r', encoding='utf-8') as sql_dump:
        for row in sql_dump:
            # Skip anything that is not an INSERT statement.
            if 'INSERT INTO' not in row:
                continue

            values = quoted_value.findall(row)
            if len(values) < 5:
                continue

            # Only the fifth captured value is of interest; short ones
            # (50 characters or fewer after stripping) are discarded.
            candidate = values[4].strip()
            if len(candidate) > 50:
                collected.append(candidate)

    quick_write_1d_to_text(write_filename, collected)

def test_extract_news():
    '''Run extract_news on the project dataset and print the elapsed time.

    Input and output paths are resolved relative to the parent of the
    current working directory: ``<parent>/dataset/tonghuashunnews.sql``
    is read and ``<parent>/dataset/all_news.txt`` is written.
    '''
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring an elapsed interval.
    start = time.perf_counter()

    root_directory = os.path.dirname(os.getcwd())
    read_filename = os.path.join(
        root_directory, 'dataset', 'tonghuashunnews.sql')
    write_filename = os.path.join(
        root_directory, 'dataset', 'all_news.txt')

    extract_news(read_filename, write_filename)
    print('Total time %f seconds' % (time.perf_counter() - start))

# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    test_extract_news()
